Manual curation trials

Published

July 14, 2025

Comparing results of different TEammo curators

Run BLASTs between curated libraries

mkdir -p results

# Make the blastdbs - they are output in same folder as the fasta input
conda activate MCHelper
while IFS=',' read -r species strain marta dean; do
  makeblastdb -in ${marta} -dbtype "nucl"
done < <(sed '1d' curated_libs.csv)

# Run BLASTS - reciprocal not needed
while IFS=',' read -r species strain marta dean; do
  blastn -query ${dean} -db ${marta} -outfmt 6 -max_hsps 1 -out results/${species}_${strain}_dean_vs_marta.blast.out
done < <(sed '1d' curated_libs.csv)

Visualisation

if(!require("BiocManager", character.only = T)) install.packages("BiocManager")
pkgs.list <- readLines("r-requirements.txt")

for (i in 1:length(pkgs.list)){
  if(!require(pkgs.list[i], character.only = T)){
    BiocManager::install(pkgs.list[i], Ncpus = ceiling(parallel::detectCores()/1.7))
    require(pkgs.list[i], character.only = TRUE)
  }else{require(pkgs.list[1],character.only = TRUE)}
}

Load and overall comparisons

setwd("/home/csic/gcy/jgp/extra_storage/dean/mctrials/mctrials")
source("scripts/mccompare_functions.R")

# Color schemes

palette_seq_match <- c(
  "Missing from query lib" = "black",                     # Clear and bold
  "Missing from subject lib" = "grey50",                  # Distinct but neutral
  "Present, 70" = RColorBrewer::brewer.pal(3, "Blues")[2],  # Medium blue
  "Present, 80" = RColorBrewer::brewer.pal(3, "Blues")[3],  # Darker blue
  "Perfect, 95-100" = RColorBrewer::brewer.pal(3, "Greens")[3]  # Bright green
)

# Load the TE classifications
teClassification <- read.table("orozco_classification-2024_mchelper.csv", sep = ";", header = TRUE)
teClassification <- teClassification %>%
  mutate(AppName = paste(Class, Order, Superfamily, sep ="/") %>% sub("/$", "", .) %>% sub("/$", "", .))
teClassification <- rbind(
  teClassification
  , c("", "Unclassified", "", "Unclassified")
)

# Load libraries to compare

# Drosophila birchii
lib_Dbir_marta <- teReadLib(
  "../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.birchii",
  strain = "v1.0_00_GenBank")

lib_Dbir_dean <- teReadLib(
  "data/MCH_output/D.birchii/v1.0_00_GenBank/D.birchii_v1.0_00_GenBank_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.birchii",
  strain = "v1.0_00_GenBank")

# Drosophila merina
lib_Dmer_marta <- teReadLib(
  "../../data/final_libraries/D.merina/D.merina.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.merina", strain = "NA")

lib_Dmer_dean <- teReadLib(
  "data/MCH_output/D.merina/NA/D.merina_NA_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.merina", strain = "NA")

# Drosophila santomea
lib_Dsan_marta <- teReadLib(
  "../../data/final_libraries/D.birchii/D.birchii.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq")

lib_Dsan_dean <- teReadLib(
  "data/MCH_output/D.santomea/STO_CAGO_1482_RefSeq/D.santomea_STO_CAGO_1482_RefSeq_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq")


# Drosophila tristis
lib_Dtri_marta <- teReadLib(
  "../../data/final_libraries/D.tristis/D.tristis.TE_library.fasta",
  libIdentifier = "Marta",
  species = "D.tristis", strain = "nanopore_D2")

lib_Dtri_dean <- teReadLib(
  "data/MCH_output/D.tristis/nanopore_D2/D.tristis_nanopore_D2_curated-TE-library.fa",
  libIdentifier = "Dean",
  species = "D.tristis", strain = "nanopore_D2")
# Load the blast results comparing curated libraries

blast_Dbir_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.birchii_v1.0_00_GenBank_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dbir_dean,
  blast_subject_lib = lib_Dbir_marta,
  species = "D.birchii",
  strain = "v1.0_00_GenBank",
  comparison = "dean_vs_marta")
   
blast_Dmer_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.merina_NA_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dmer_dean,
  blast_subject_lib = lib_Dmer_marta,
  species = "D.merina",
  strain = "NA",
  comparison = "dean_vs_marta")

blast_Dsan_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.santomea_STO_CAGO_1482_RefSeq_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dsan_dean,
  blast_subject_lib = lib_Dsan_marta,
  species = "D.santomea",
  strain = "STO_CAGO_1482_RefSeq",
  comparison = "dean_vs_marta")

blast_Dtri_dean_vs_marta <- LoadBlastComparison(
  blast_out = "results/D.tristis_nanopore_D2_dean_vs_marta.blast.out",
  blast_query_lib = lib_Dtri_dean,
  blast_subject_lib = lib_Dtri_marta,
  species = "D.tristis",
  strain = "nanopore_D2",
  comparison = "dean_vs_marta")

# Bring them all together for overall
blast_all <- bind_rows(
  blast_Dbir_dean_vs_marta,
  blast_Dmer_dean_vs_marta,
  blast_Dsan_dean_vs_marta,
  blast_Dtri_dean_vs_marta
)

How do the libraries compare overall?

tePlotLib(list(lib_Dbir_dean, lib_Dbir_marta))
tePlotLib(list(lib_Dmer_dean, lib_Dmer_marta))
tePlotLib(list(lib_Dsan_dean, lib_Dsan_marta))
tePlotLib(list(lib_Dtri_dean, lib_Dtri_marta))

How often do the curators agree based on BLASTn sequence identity between their final curated libraries?

PlotBlastBarMatches(blast_all)
Figure 1: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by BLASTn hit totals (A), percent by TE classification (B), and by total quantites of TE consensuses missing from the query library (C), also shown as classification None/None/None in A & B. BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library
PlotBlastBarMatches(blast_Dbir_dean_vs_marta)
Figure 2: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by BLASTn hit totals (A), percent by TE classification (B), and by total quantites of TE consensuses missing from the query library (C), also shown as classification None/None/None in A & B. BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library
PlotBlastBarMatches(blast_Dmer_dean_vs_marta)
Figure 3: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by BLASTn hit totals (A), percent by TE classification (B), and by total quantites of TE consensuses missing from the query library (C), also shown as classification None/None/None in A & B. BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library
PlotBlastBarMatches(blast_Dsan_dean_vs_marta)
Figure 4: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by BLASTn hit totals (A), percent by TE classification (B), and by total quantites of TE consensuses missing from the query library (C), also shown as classification None/None/None in A & B. BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library
PlotBlastBarMatches(blast_Dtri_dean_vs_marta)
Figure 5: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by BLASTn hit totals (A), percent by TE classification (B), and by total quantites of TE consensuses missing from the query library (C), also shown as classification None/None/None in A & B. BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library

How often do curators agree on TE classifications?

PlotBlastTileMatches(blast_all)
Figure 6: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by comparing TE curator-assigned classifications between query (x axis, Dean) TE consensus and its best BLASTn hit in the subject (y axis, Marta) library. Facets of x axis show: BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library (not shown). Facets of y axis show: level of TE classification agreement e.g. Class, Family, Subfamily indicates that the two curators classified the TE exactly the same, whereas Class, Family means they differed on Subclass assignment - ‘None’ means each curator classified the TE as different Class, Family, and Subfamily!
PlotBlastTileMatches(blast_Dbir_dean_vs_marta)
Figure 7: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by comparing TE curator-assigned classifications between query (x axis, Dean) TE consensus and its best BLASTn hit in the subject (y axis, Marta) library. Facets of x axis show: BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library (not shown). Facets of y axis show: level of TE classification agreement e.g. Class, Family, Subfamily indicates that the two curators classified the TE exactly the same, whereas Class, Family means they differed on Subclass assignment - ‘None’ means each curator classified the TE as different Class, Family, and Subfamily!
PlotBlastTileMatches(blast_Dmer_dean_vs_marta)
Figure 8: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by comparing TE curator-assigned classifications between query (x axis, Dean) TE consensus and its best BLASTn hit in the subject (y axis, Marta) library. Facets of x axis show: BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library (not shown). Facets of y axis show: level of TE classification agreement e.g. Class, Family, Subfamily indicates that the two curators classified the TE exactly the same, whereas Class, Family means they differed on Subclass assignment - ‘None’ means each curator classified the TE as different Class, Family, and Subfamily!
PlotBlastTileMatches(blast_Dsan_dean_vs_marta)
Figure 9: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by comparing TE curator-assigned classifications between query (x axis, Dean) TE consensus and its best BLASTn hit in the subject (y axis, Marta) library. Facets of x axis show: BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library (not shown). Facets of y axis show: level of TE classification agreement e.g. Class, Family, Subfamily indicates that the two curators classified the TE exactly the same, whereas Class, Family means they differed on Subclass assignment - ‘None’ means each curator classified the TE as different Class, Family, and Subfamily!
PlotBlastTileMatches(blast_Dtri_dean_vs_marta)
Figure 10: TE query library (current curator, Dean) vs subject library (previous curator, Marta) by comparing TE curator-assigned classifications between query (x axis, Dean) TE consensus and its best BLASTn hit in the subject (y axis, Marta) library. Facets of x axis show: BLASTn hits of TE consensus library categorised by BLAST percent identity (pident): Perfect, 95-100 %; Present 80, 80-94 %; Present 70, 70-79 %; Missing, 69 % > - subdivided by missing from query or subject library (not shown). Facets of y axis show: level of TE classification agreement e.g. Class, Family, Subfamily indicates that the two curators classified the TE exactly the same, whereas Class, Family means they differed on Subclass assignment - ‘None’ means each curator classified the TE as different Class, Family, and Subfamily!

TE size distribution

lib1 <- width(lib_Dtri_dean) %>%
  data.frame(length = .)
lib1$curator <- "dean"

lib2 <- width(lib_Dtri_marta) %>%
  data.frame(length = .)
lib2$curator <- "marta"

bind_rows(lib1, lib2) %>%
    ggplot(aes(x = length)) +
      geom_density(aes(fill = curator), alpha = 0.5, color = "black") +
      labs(title = "Density Plot of DNA Sequence Lengths",
           x = "Sequence Length (bp)",
           y = "Density") +
      theme_minimal()